{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "yelp_path = '../../narre_all_data/yelp/yelp_dataset'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_user2num(min_num=20, max_num=100):\n",
    "    \"\"\"\n",
    "    根据评论数目加载user\n",
    "    min_num: 最小数目\n",
    "    max_num: 最大数目\n",
    "    \"\"\"\n",
    "    user2num = {}\n",
    "    with open(os.path.join(yelp_path, 'user.json'), 'r', encoding='utf-8') as f:\n",
    "        while True:\n",
    "            line = f.readline()\n",
    "            if not line:\n",
    "                break\n",
    "\n",
    "            user = json.loads(line)\n",
    "            id = user['user_id']\n",
    "            num = user['review_count']\n",
    "            \n",
    "            if num >= min_num and num <= max_num:\n",
    "                user2num[id] = num\n",
    "\n",
    "#     user2num = dict(sorted(user2num.items(), key=lambda item:item[1]))\n",
    "    return user2num"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_item2num(min_num=20, max_num=100):\n",
    "    \"\"\"\n",
    "    根据评论数目加载item\n",
    "    min_num: 最小数目\n",
    "    max_num: 最大数目\n",
    "    \"\"\"\n",
    "    item2num = {}\n",
    "    with open(os.path.join(yelp_path, 'business.json'), 'r', encoding='utf-8') as f:\n",
    "        while True:\n",
    "            line = f.readline()\n",
    "            if not line:\n",
    "                break\n",
    "\n",
    "            item = json.loads(line)\n",
    "            id = item['business_id']\n",
    "            num = item['review_count']\n",
    "\n",
    "            if num >= min_num and num <= max_num:\n",
    "                item2num[id] = num\n",
    "\n",
    "#     item2num = dict(sorted(item2num.items(), key=lambda item:item[1]))\n",
    "    return item2num"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_review(users, items, sample_num=200000):    \n",
    "    \"\"\"\n",
    "    根据users和items随机采样评论\n",
    "    users: \n",
    "    items: \n",
    "    sample_num: 采样数目\n",
    "    \"\"\"\n",
    "    datas = []\n",
    "    with open(os.path.join(yelp_path, 'review.json'), 'r', encoding='utf-8') as f:\n",
    "        while True:\n",
    "            line = f.readline()\n",
    "            if not line:\n",
    "                break\n",
    "\n",
    "            item = json.loads(line)\n",
    "            user_id = item['user_id']\n",
    "            item_id = item['business_id']\n",
    "            review = item['text']\n",
    "            rating = item['stars']\n",
    "            date = item['date']\n",
    "            \n",
    "            if date >= '2018':\n",
    "                continue\n",
    "            \n",
    "            if user_id in users and item_id in items:\n",
    "                datas.append({'reviewerID':user_id, 'asin':item_id, 'reviewText':review, 'overall':rating})\n",
    "    \n",
    "#     datas = random.sample(datas, sample_num)\n",
    "    return datas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "user2num = load_user2num()\n",
    "item2num = load_item2num()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "datas = load_review(user2num.keys(), item2num.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess(min_review_num=5):\n",
    "    users = {}\n",
    "    items = {}\n",
    "    for data in datas:\n",
    "        num = users.setdefault(data['reviewerID'], 0) + 1\n",
    "        users[data['reviewerID']] = num\n",
    "        num = items.setdefault(data['asin'], 0) + 1\n",
    "        items[data['asin']] = num\n",
    "        \n",
    "    deleteUsers = [user for user,num in users.items() if num<min_review_num]\n",
    "    deleteItems = [item for item,num in items.items() if num<min_review_num]\n",
    "    \n",
    "    print('user num: %d, delete num: %d' % (len(users), len(deleteUsers)))\n",
    "    print('item num: %d, delete num: %d' % (len(items), len(deleteItems)))\n",
    "    \n",
    "    tempDatas = []\n",
    "    for data in datas:\n",
    "        user = data['reviewerID']\n",
    "        item = data['asin']\n",
    "        \n",
    "        if user not in deleteUsers and item not in deleteItems:\n",
    "            tempDatas.append(data)\n",
    "    return tempDatas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "user num: 110021, delete num: 96976\n",
      "item num: 43268, delete num: 23028\n"
     ]
    }
   ],
   "source": [
    "tempDatas = preprocess(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "13045\n",
      "19745\n"
     ]
    }
   ],
   "source": [
    "print(len(list(set([data['reviewerID'] for data in tempDatas]))))\n",
    "print(len(list(set([data['asin'] for data in tempDatas]))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(os.path.join(yelp_path, '../', 'yelp.json'), 'w', encoding='utf-8') as f:\n",
    "    for data in tempDatas:\n",
    "        f.write(json.dumps(data) + '\\n')        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
